import pandas as pd
import numpy as np
import plotly.express as plx
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Load the energy-efficiency dataset. pd.read_excel already returns a
# DataFrame, so the original pd.DataFrame(...) wrapper was a redundant copy.
df = pd.read_excel('C:\\Users\\harip\\INEURON_PROJECTS\\Energy Efficiency\\energy+efficiency\\ENB2012_data.xlsx')
df
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 2 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 3 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 4 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 5 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 0.90 | 563.5 | 318.5 | 122.50 | 7.0 | 2 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 0.64 | 784.0 | 343.0 | 220.50 | 3.5 | 5 | 0.4 | 5 | 17.88 | 21.40 |
| 764 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 2 | 0.4 | 5 | 16.54 | 16.88 |
| 765 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 3 | 0.4 | 5 | 16.44 | 17.11 |
| 766 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 4 | 0.4 | 5 | 16.48 | 16.61 |
| 767 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 5 | 0.4 | 5 | 16.64 | 16.03 |
768 rows × 10 columns
# Missing-value audit: count of NaNs per column (isna is the modern alias of isnull).
df.isna().sum()
X1 0 X2 0 X3 0 X4 0 X5 0 X6 0 X7 0 X8 0 Y1 0 Y2 0 dtype: int64
# Dtypes and non-null counts: all 768 rows complete, all columns numeric.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X1 768 non-null float64 1 X2 768 non-null float64 2 X3 768 non-null float64 3 X4 768 non-null float64 4 X5 768 non-null float64 5 X6 768 non-null int64 6 X7 768 non-null float64 7 X8 768 non-null int64 8 Y1 768 non-null float64 9 Y2 768 non-null float64 dtypes: float64(8), int64(2) memory usage: 60.1 KB
# Summary statistics (count/mean/std/quartiles) for every feature and both targets.
df.describe()
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.00000 | 768.000000 | 768.000000 | 768.00000 | 768.000000 | 768.000000 |
| mean | 0.764167 | 671.708333 | 318.500000 | 176.604167 | 5.25000 | 3.500000 | 0.234375 | 2.81250 | 22.307195 | 24.587760 |
| std | 0.105777 | 88.086116 | 43.626481 | 45.165950 | 1.75114 | 1.118763 | 0.133221 | 1.55096 | 10.090204 | 9.513306 |
| min | 0.620000 | 514.500000 | 245.000000 | 110.250000 | 3.50000 | 2.000000 | 0.000000 | 0.00000 | 6.010000 | 10.900000 |
| 25% | 0.682500 | 606.375000 | 294.000000 | 140.875000 | 3.50000 | 2.750000 | 0.100000 | 1.75000 | 12.992500 | 15.620000 |
| 50% | 0.750000 | 673.750000 | 318.500000 | 183.750000 | 5.25000 | 3.500000 | 0.250000 | 3.00000 | 18.950000 | 22.080000 |
| 75% | 0.830000 | 741.125000 | 343.000000 | 220.500000 | 7.00000 | 4.250000 | 0.400000 | 4.00000 | 31.667500 | 33.132500 |
| max | 0.980000 | 808.500000 | 416.500000 | 220.500000 | 7.00000 | 5.000000 | 0.400000 | 5.00000 | 43.100000 | 48.030000 |
# Correlation heatmap of all features and targets.
plx.imshow(df.corr(),height=750,width=750,text_auto=True)
# X6's correlations with the other columns and both targets are ~0
# (see the Series below), making it a candidate for removal.
df.corr()['X6']
X1 4.678592e-17 X2 -3.459372e-17 X3 -2.429499e-17 X4 -5.830058e-17 X5 4.492205e-17 X6 1.000000e+00 X7 -9.406007e-16 X8 -2.549352e-16 Y1 -2.586763e-03 Y2 1.428960e-02 Name: X6, dtype: float64
# X6 takes the values 2-5, uniformly distributed (192 rows each).
df['X6'].value_counts()
2 192 3 192 4 192 5 192 Name: X6, dtype: int64
# X8 takes the values 0-5; 0 occurs for only 48 rows, 1-5 for 144 each.
df['X8'].value_counts()
1 144 2 144 3 144 4 144 5 144 0 48 Name: X8, dtype: int64
# Box plots of each target per X8 category, to see whether X8 separates the targets.
plx.box(x = df['X8'],y=df['Y1'],color=df['X8'])
plx.box(x = df['X8'],y=df['Y2'],color=df['X8'])
# Re-draw the correlation heatmap before pruning features.
plx.imshow(df.corr(),height=750,width=750,text_auto=True)
# X6 showed near-zero correlation with both targets, so drop it in place.
df.drop(['X6'],axis=1,inplace=True)
df
| X1 | X2 | X3 | X4 | X5 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 0.90 | 563.5 | 318.5 | 122.50 | 7.0 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 0.64 | 784.0 | 343.0 | 220.50 | 3.5 | 0.4 | 5 | 17.88 | 21.40 |
| 764 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.54 | 16.88 |
| 765 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.44 | 17.11 |
| 766 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.48 | 16.61 |
| 767 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.64 | 16.03 |
768 rows × 9 columns
# Drop X1 based on the heatmap above (appears strongly collinear with the
# remaining geometry features -- verify against the correlation figure).
df.drop(['X1'],axis=1,inplace=True)
df
| X2 | X3 | X4 | X5 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 563.5 | 318.5 | 122.50 | 7.0 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 784.0 | 343.0 | 220.50 | 3.5 | 0.4 | 5 | 17.88 | 21.40 |
| 764 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.54 | 16.88 |
| 765 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.44 | 17.11 |
| 766 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.48 | 16.61 |
| 767 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.64 | 16.03 |
768 rows × 8 columns
# Drop X4 as well, again motivated by the correlation heatmap -- TODO confirm
# the exact collinearity that justified this.
df.drop(['X4'],axis=1,inplace=True)
df
| X2 | X3 | X5 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|
| 0 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 563.5 | 318.5 | 7.0 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 784.0 | 343.0 | 3.5 | 0.4 | 5 | 17.88 | 21.40 |
| 764 | 808.5 | 367.5 | 3.5 | 0.4 | 5 | 16.54 | 16.88 |
| 765 | 808.5 | 367.5 | 3.5 | 0.4 | 5 | 16.44 | 17.11 |
| 766 | 808.5 | 367.5 | 3.5 | 0.4 | 5 | 16.48 | 16.61 |
| 767 | 808.5 | 367.5 | 3.5 | 0.4 | 5 | 16.64 | 16.03 |
768 rows × 7 columns
# Collapse X8 (codes 0-5) into a binary flag: 0 stays 0, everything above becomes 1.
# clip(upper=1) preserves the integer dtype and leaves zeros untouched.
df['X8'] = df['X8'].clip(upper=1)
df
| X2 | X3 | X5 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|
| 0 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 563.5 | 318.5 | 7.0 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 784.0 | 343.0 | 3.5 | 0.4 | 1 | 17.88 | 21.40 |
| 764 | 808.5 | 367.5 | 3.5 | 0.4 | 1 | 16.54 | 16.88 |
| 765 | 808.5 | 367.5 | 3.5 | 0.4 | 1 | 16.44 | 17.11 |
| 766 | 808.5 | 367.5 | 3.5 | 0.4 | 1 | 16.48 | 16.61 |
| 767 | 808.5 | 367.5 | 3.5 | 0.4 | 1 | 16.64 | 16.03 |
768 rows × 7 columns
# Correlation heatmap after pruning X1/X4/X6 and binarising X8.
plx.imshow(df.corr(),height=750,width=750,text_auto=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X2 768 non-null float64 1 X3 768 non-null float64 2 X5 768 non-null float64 3 X7 768 non-null float64 4 X8 768 non-null int64 5 Y1 768 non-null float64 6 Y2 768 non-null float64 dtypes: float64(6), int64(1) memory usage: 42.1 KB
# Box plots of each target against the now-binary X8 flag.
plx.box(x = df['X8'],y=df['Y1'],color=df['X8'])
# BUG FIX: the second plot repeated Y1; following the earlier Y1/Y2 pair of
# box plots, it should show the cooling load Y2.
plx.box(x = df['X8'],y=df['Y2'],color=df['X8'])
len(df)
768
# Dropping X1/X4/X6 made some rows identical; remove the duplicates (768 -> 756).
df.drop_duplicates(inplace=True)
df
| X2 | X3 | X5 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|
| 0 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 563.5 | 318.5 | 7.0 | 0.0 | 0 | 20.84 | 28.28 |
| 5 | 563.5 | 318.5 | 7.0 | 0.0 | 0 | 21.46 | 25.38 |
| 6 | 563.5 | 318.5 | 7.0 | 0.0 | 0 | 20.71 | 25.16 |
| 7 | 563.5 | 318.5 | 7.0 | 0.0 | 0 | 19.68 | 29.60 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 784.0 | 343.0 | 3.5 | 0.4 | 1 | 17.88 | 21.40 |
| 764 | 808.5 | 367.5 | 3.5 | 0.4 | 1 | 16.54 | 16.88 |
| 765 | 808.5 | 367.5 | 3.5 | 0.4 | 1 | 16.44 | 17.11 |
| 766 | 808.5 | 367.5 | 3.5 | 0.4 | 1 | 16.48 | 16.61 |
| 767 | 808.5 | 367.5 | 3.5 | 0.4 | 1 | 16.64 | 16.03 |
756 rows × 7 columns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score
from tqdm import tqdm
# Multi-output setup: predict heating load (Y1) and cooling load (Y2) jointly.
X = df.drop(['Y1','Y2'],axis=1)
Y = df.drop(['X2','X3','X5','X7','X8'],axis=1)

# One train-score and one test-score history per model, one entry per random split.
lr_trn_score,rfr_trn_score,sgd_trn_score,en_trn_score,abr_trn_score,gbr_trn_score,svr_trn_score,xgb_trn_score,cbr_trn_score = [],[],[],[],[],[],[],[],[]
lr_test_score,rfr_test_score,sgd_test_score,en_test_score,abr_test_score,gbr_test_score,svr_test_score,xgb_test_score,cbr_test_score = [],[],[],[],[],[],[],[],[]

def _fit_and_score(model, x_train, x_test, y_train, y_test, trn_scores, test_scores):
    """Fit *model* on the training split and append its train/test R^2 scores."""
    model.fit(x_train, y_train)
    test_scores.append(r2_score(y_test, model.predict(x_test)))
    trn_scores.append(r2_score(y_train, model.predict(x_train)))

# Repeat random 80/20 splits to estimate the spread of each model's R^2.
for i in tqdm(range(1000)):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    _fit_and_score(LinearRegression(), x_train, x_test, y_train, y_test, lr_trn_score, lr_test_score)
    _fit_and_score(MultiOutputRegressor(SGDRegressor()), x_train, x_test, y_train, y_test, sgd_trn_score, sgd_test_score)
    _fit_and_score(ElasticNet(), x_train, x_test, y_train, y_test, en_trn_score, en_test_score)
    _fit_and_score(MultiOutputRegressor(AdaBoostRegressor()), x_train, x_test, y_train, y_test, abr_trn_score, abr_test_score)
    _fit_and_score(MultiOutputRegressor(GradientBoostingRegressor()), x_train, x_test, y_train, y_test, gbr_trn_score, gbr_test_score)
    _fit_and_score(MultiOutputRegressor(SVR()), x_train, x_test, y_train, y_test, svr_trn_score, svr_test_score)
    _fit_and_score(MultiOutputRegressor(XGBRegressor()), x_train, x_test, y_train, y_test, xgb_trn_score, xgb_test_score)
    _fit_and_score(MultiOutputRegressor(CatBoostRegressor(verbose=0)), x_train, x_test, y_train, y_test, cbr_trn_score, cbr_test_score)
    # BUG FIX: the original computed the random forest's train score from the
    # linear model's predictions (pred_trn = lr.predict(x_train)); the helper
    # now scores rfr with its own predictions.
    _fit_and_score(RandomForestRegressor(), x_train, x_test, y_train, y_test, rfr_trn_score, rfr_test_score)
100%|██████████| 1000/1000 [25:04<00:00, 1.50s/it]
def _plot_train_vs_test(test_scores, trn_scores, model_name):
    """Stacked subplots: test-split R^2 (top) and train-split R^2 (bottom)
    across the repeated random splits, for one model."""
    fig = make_subplots(rows=2, cols=1)
    fig.append_trace(go.Scatter(y=test_scores, name='Test Score'), row=1, col=1)
    fig.append_trace(go.Scatter(y=trn_scores, name='Train Score'), row=2, col=1)
    fig.update_layout(title=f'Train vs Test Score on {model_name}')
    fig.show()

# One figure per model -- identical output to the nine copy-pasted stanzas it replaces.
_plot_train_vs_test(lr_test_score, lr_trn_score, 'Linear Regression')
_plot_train_vs_test(sgd_test_score, sgd_trn_score, 'SGDRegressor')
_plot_train_vs_test(en_test_score, en_trn_score, 'ElasticNet Regression')
_plot_train_vs_test(abr_test_score, abr_trn_score, 'AdaBoostRegressor')
_plot_train_vs_test(gbr_test_score, gbr_trn_score, 'GradientBoostingRegressor')
_plot_train_vs_test(svr_test_score, svr_trn_score, 'Support Vector Regressor')
_plot_train_vs_test(xgb_test_score, xgb_trn_score, 'XGBRegressor')
_plot_train_vs_test(cbr_test_score, cbr_trn_score, 'CatBoostRegressor')
_plot_train_vs_test(rfr_test_score, rfr_trn_score, 'RandomForestRegressor')
# Final multi-output model: CatBoost (per-target, via MultiOutputRegressor)
# with a large tree budget.
X = df.drop(['Y1','Y2'],axis=1)
Y = df.drop(['X2','X3','X5','X7','X8'],axis=1)
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
# NOTE(review): early_stopping_rounds is set but no eval_set is passed to
# fit() -- confirm early stopping actually triggers here.
cbr = MultiOutputRegressor(CatBoostRegressor(verbose=0, n_estimators=10000,early_stopping_rounds=100)).fit(x_train,y_train)
pred = cbr.predict(x_test)
pred_trn = cbr.predict(x_train)
print(r2_score(y_train, pred_trn), r2_score(y_test, pred))
0.9846453823145591 0.9849155431050601
# Split the two-column prediction matrix into one list per target.
y1_pred = [row[0] for row in pred]
y2_pred = [row[1] for row in pred]
def visulaize_performance_of_the_model(pred, y_test, modelname):
    """Scatter predicted vs. actual test values against the identity line.

    Points hugging the diagonal indicate accurate predictions. (The function
    name keeps its original misspelling so existing call sites still work.)
    """
    diagonal = np.arange(0, 50)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=diagonal, y=diagonal,
                             mode='lines',
                             name='perfectline'))
    fig.add_trace(go.Scatter(x=pred, y=y_test,
                             mode='markers',
                             name='predictions'))
    fig.update_layout(
        title=f"Performance of {modelname} on Test data",
        xaxis_title="Predicted",
        yaxis_title="Actual",
        font=dict(
            family="Courier New, monospace",
            size=13,
            color="RebeccaPurple",
        ),
    )
    fig.show()
# Predicted-vs-actual plots for each target of the multi-output model.
visulaize_performance_of_the_model(y1_pred, y_test['Y1'], 'CatBoost regressor')
visulaize_performance_of_the_model(y2_pred, y_test['Y2'], 'CatBoost regressor')
# Single-target experiment: heating load Y1 only.
X = df.drop(['Y1','Y2'],axis=1)
Y = df['Y1']
# Fresh train/test score histories for the reduced model set.
lr_trn_score,rfr_trn_score,abr_trn_score,gbr_trn_score,xgb_trn_score,cbr_trn_score = [],[],[],[],[],[]
lr_test_score,rfr_test_score,abr_test_score,gbr_test_score,xgb_test_score,cbr_test_score = [],[],[],[],[],[]
def _fit_and_score(model, x_train, x_test, y_train, y_test, trn_scores, test_scores):
    """Fit *model* on the training split and append its train/test R^2 scores."""
    model.fit(x_train, y_train)
    test_scores.append(r2_score(y_test, model.predict(x_test)))
    trn_scores.append(r2_score(y_train, model.predict(x_train)))

# 1000 random 80/20 splits over the single-target (Y1) problem.
for i in tqdm(range(1000)):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    _fit_and_score(LinearRegression(), x_train, x_test, y_train, y_test, lr_trn_score, lr_test_score)
    _fit_and_score(AdaBoostRegressor(), x_train, x_test, y_train, y_test, abr_trn_score, abr_test_score)
    _fit_and_score(GradientBoostingRegressor(), x_train, x_test, y_train, y_test, gbr_trn_score, gbr_test_score)
    _fit_and_score(XGBRegressor(), x_train, x_test, y_train, y_test, xgb_trn_score, xgb_test_score)
    _fit_and_score(CatBoostRegressor(verbose=0), x_train, x_test, y_train, y_test, cbr_trn_score, cbr_test_score)
    # BUG FIX: the original scored the random forest's training fit with the
    # linear model's predictions (pred_trn = lr.predict(x_train)); the helper
    # now uses rfr's own predictions.
    _fit_and_score(RandomForestRegressor(), x_train, x_test, y_train, y_test, rfr_trn_score, rfr_test_score)
100%|██████████| 1000/1000 [12:30<00:00, 1.33it/s]
def _plot_train_vs_test(test_scores, trn_scores, model_name):
    """Stacked subplots: test-split R^2 (top) and train-split R^2 (bottom)
    across the repeated random splits, for one model."""
    fig = make_subplots(rows=2, cols=1)
    fig.append_trace(go.Scatter(y=test_scores, name='Test Score'), row=1, col=1)
    fig.append_trace(go.Scatter(y=trn_scores, name='Train Score'), row=2, col=1)
    fig.update_layout(title=f'Train vs Test Score on {model_name}')
    fig.show()

# One figure per model for the Y1-only run, identical to the six original stanzas.
_plot_train_vs_test(lr_test_score, lr_trn_score, 'Linear Regression')
_plot_train_vs_test(abr_test_score, abr_trn_score, 'AdaBoostRegressor')
_plot_train_vs_test(gbr_test_score, gbr_trn_score, 'GradientBoostingRegressor')
_plot_train_vs_test(xgb_test_score, xgb_trn_score, 'XGBRegressor')
_plot_train_vs_test(cbr_test_score, cbr_trn_score, 'CatBoostRegressor')
_plot_train_vs_test(rfr_test_score, rfr_trn_score, 'RandomForestRegressor')
# Single-target experiment: cooling load Y2 only.
X = df.drop(['Y1','Y2'],axis=1)
Y = df['Y2']

# Fresh train/test score histories for the Y2 run.
lr_trn_score,rfr_trn_score,abr_trn_score,gbr_trn_score,xgb_trn_score,cbr_trn_score = [],[],[],[],[],[]
lr_test_score,rfr_test_score,abr_test_score,gbr_test_score,xgb_test_score,cbr_test_score = [],[],[],[],[],[]

def _fit_and_score(model, x_train, x_test, y_train, y_test, trn_scores, test_scores):
    """Fit *model* on the training split and append its train/test R^2 scores."""
    model.fit(x_train, y_train)
    test_scores.append(r2_score(y_test, model.predict(x_test)))
    trn_scores.append(r2_score(y_train, model.predict(x_train)))

# 1000 random 80/20 splits over the single-target (Y2) problem.
for i in tqdm(range(1000)):
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    _fit_and_score(LinearRegression(), x_train, x_test, y_train, y_test, lr_trn_score, lr_test_score)
    _fit_and_score(AdaBoostRegressor(), x_train, x_test, y_train, y_test, abr_trn_score, abr_test_score)
    _fit_and_score(GradientBoostingRegressor(), x_train, x_test, y_train, y_test, gbr_trn_score, gbr_test_score)
    _fit_and_score(XGBRegressor(), x_train, x_test, y_train, y_test, xgb_trn_score, xgb_test_score)
    _fit_and_score(CatBoostRegressor(verbose=0), x_train, x_test, y_train, y_test, cbr_trn_score, cbr_test_score)
    # BUG FIX: the original scored the random forest's training fit with the
    # linear model's predictions (pred_trn = lr.predict(x_train)); the helper
    # now uses rfr's own predictions.
    _fit_and_score(RandomForestRegressor(), x_train, x_test, y_train, y_test, rfr_trn_score, rfr_test_score)
100%|██████████| 1000/1000 [12:00<00:00, 1.39it/s]
def _plot_train_vs_test(test_scores, trn_scores, model_name):
    """Stacked subplots: test-split R^2 (top) and train-split R^2 (bottom)
    across the repeated random splits, for one model."""
    fig = make_subplots(rows=2, cols=1)
    fig.append_trace(go.Scatter(y=test_scores, name='Test Score'), row=1, col=1)
    fig.append_trace(go.Scatter(y=trn_scores, name='Train Score'), row=2, col=1)
    fig.update_layout(title=f'Train vs Test Score on {model_name}')
    fig.show()

# One figure per model for the Y2-only run, identical to the six original stanzas.
_plot_train_vs_test(lr_test_score, lr_trn_score, 'Linear Regression')
_plot_train_vs_test(abr_test_score, abr_trn_score, 'AdaBoostRegressor')
_plot_train_vs_test(gbr_test_score, gbr_trn_score, 'GradientBoostingRegressor')
_plot_train_vs_test(xgb_test_score, xgb_trn_score, 'XGBRegressor')
_plot_train_vs_test(cbr_test_score, cbr_trn_score, 'CatBoostRegressor')
_plot_train_vs_test(rfr_test_score, rfr_trn_score, 'RandomForestRegressor')
# Predict Y2 with Y1 left in as a predictor (only Y2 is dropped from X) --
# presumably to measure how much knowing Y1 helps; verify this is intentional.
X = df.drop(['Y2'],axis=1)
Y = df['Y2']
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
cbr = CatBoostRegressor(verbose=0, n_estimators=10000,early_stopping_rounds=100).fit(x_train,y_train)
pred = cbr.predict(x_test)
pred_trn = cbr.predict(x_train)
print(r2_score(y_train, pred_trn), r2_score(y_test, pred))
0.9936020738750924 0.9823075865850464
visulaize_performance_of_the_model(pred, y_test, 'CatBoost regressor')
# After binarisation and dedup, X8 is 1 for 712 rows and 0 for 44.
df['X8'].value_counts()
1 712 0 44 Name: X8, dtype: int64
# Reload the raw dataset to restart feature selection from scratch.
# pd.read_excel already returns a DataFrame; the pd.DataFrame(...) wrapper was redundant.
df = pd.read_excel('C:\\Users\\harip\\INEURON_PROJECTS\\Energy Efficiency\\energy+efficiency\\ENB2012_data.xlsx')
df
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 2 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 3 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 4 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 0.98 | 514.5 | 294.0 | 110.25 | 7.0 | 5 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 0.90 | 563.5 | 318.5 | 122.50 | 7.0 | 2 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 0.64 | 784.0 | 343.0 | 220.50 | 3.5 | 5 | 0.4 | 5 | 17.88 | 21.40 |
| 764 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 2 | 0.4 | 5 | 16.54 | 16.88 |
| 765 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 3 | 0.4 | 5 | 16.44 | 17.11 |
| 766 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 4 | 0.4 | 5 | 16.48 | 16.61 |
| 767 | 0.62 | 808.5 | 367.5 | 220.50 | 3.5 | 5 | 0.4 | 5 | 16.64 | 16.03 |
768 rows × 10 columns
# Same pruning as before in a single call; X8 keeps its raw 0-5 codes this time.
df.drop(['X1','X4','X6'],axis=1,inplace=True)
df
| X2 | X3 | X5 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|
| 0 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 514.5 | 294.0 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 563.5 | 318.5 | 7.0 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 784.0 | 343.0 | 3.5 | 0.4 | 5 | 17.88 | 21.40 |
| 764 | 808.5 | 367.5 | 3.5 | 0.4 | 5 | 16.54 | 16.88 |
| 765 | 808.5 | 367.5 | 3.5 | 0.4 | 5 | 16.44 | 17.11 |
| 766 | 808.5 | 367.5 | 3.5 | 0.4 | 5 | 16.48 | 16.61 |
| 767 | 808.5 | 367.5 | 3.5 | 0.4 | 5 | 16.64 | 16.03 |
768 rows × 7 columns
# Y2 from features only (raw 0-5 X8, no dedup): baseline without Y1 as a predictor.
X = df.drop(['Y1','Y2'],axis=1)
Y = df['Y2']
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
cbr = CatBoostRegressor(verbose=0, n_estimators=10000,early_stopping_rounds=100).fit(x_train,y_train)
pred = cbr.predict(x_test)
pred_trn = cbr.predict(x_train)
print(r2_score(y_train, pred_trn), r2_score(y_test, pred))
0.9763488647084976 0.9515146871956497
visulaize_performance_of_the_model(pred, y_test, 'CatBoost regressor')
# Y2 again, now with Y1 kept in X as a predictor (only Y2 dropped).
X = df.drop(['Y2'],axis=1)
Y = df['Y2']
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
cbr = CatBoostRegressor(verbose=0, n_estimators=10000,early_stopping_rounds=100).fit(x_train,y_train)
pred = cbr.predict(x_test)
pred_trn = cbr.predict(x_train)
print(r2_score(y_train, pred_trn), r2_score(y_test, pred))
0.996604989482951 0.9764696781109573
visulaize_performance_of_the_model(pred, y_test, 'CatBoost regressor')
# Ablation: remove X8 entirely into a working copy.
temp_df = df.drop(['X8'],axis=1)
temp_df
| X2 | X3 | X5 | X7 | Y1 | Y2 | |
|---|---|---|---|---|---|---|
| 0 | 514.5 | 294.0 | 7.0 | 0.0 | 15.55 | 21.33 |
| 1 | 514.5 | 294.0 | 7.0 | 0.0 | 15.55 | 21.33 |
| 2 | 514.5 | 294.0 | 7.0 | 0.0 | 15.55 | 21.33 |
| 3 | 514.5 | 294.0 | 7.0 | 0.0 | 15.55 | 21.33 |
| 4 | 563.5 | 318.5 | 7.0 | 0.0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... |
| 763 | 784.0 | 343.0 | 3.5 | 0.4 | 17.88 | 21.40 |
| 764 | 808.5 | 367.5 | 3.5 | 0.4 | 16.54 | 16.88 |
| 765 | 808.5 | 367.5 | 3.5 | 0.4 | 16.44 | 17.11 |
| 766 | 808.5 | 367.5 | 3.5 | 0.4 | 16.48 | 16.61 |
| 767 | 808.5 | 367.5 | 3.5 | 0.4 | 16.64 | 16.03 |
768 rows × 6 columns
# Y2 without X8 but with Y1 still available as a predictor.
X = temp_df.drop(['Y2'],axis=1)
Y = temp_df['Y2']
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
cbr = CatBoostRegressor(verbose=0, n_estimators=10000,early_stopping_rounds=100).fit(x_train,y_train)
pred = cbr.predict(x_test)
pred_trn = cbr.predict(x_train)
print(r2_score(y_train, pred_trn), r2_score(y_test, pred))
0.995509035148715 0.9747846196154121
visulaize_performance_of_the_model(pred, y_test, 'CatBoost regressor')
# Y2 without X8 and without Y1: features only.
X = temp_df.drop(['Y1','Y2'],axis=1)
Y = temp_df['Y2']
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
cbr = CatBoostRegressor(verbose=0, n_estimators=10000,early_stopping_rounds=100).fit(x_train,y_train)
pred = cbr.predict(x_test)
pred_trn = cbr.predict(x_train)
print(r2_score(y_train, pred_trn), r2_score(y_test, pred))
0.9724107776852532 0.9669833472207664
visulaize_performance_of_the_model(pred, y_test, 'CatBoost regressor')
# Fresh reload, this time keeping X4: only X1 and X6 are dropped.
# pd.read_excel already returns a DataFrame; the pd.DataFrame(...) wrapper was redundant.
df = pd.read_excel('C:\\Users\\harip\\INEURON_PROJECTS\\Energy Efficiency\\energy+efficiency\\ENB2012_data.xlsx')
temp_df = df.drop(['X1','X6'],axis=1)
temp_df
| X2 | X3 | X4 | X5 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 563.5 | 318.5 | 122.50 | 7.0 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 784.0 | 343.0 | 220.50 | 3.5 | 0.4 | 5 | 17.88 | 21.40 |
| 764 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.54 | 16.88 |
| 765 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.44 | 17.11 |
| 766 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.48 | 16.61 |
| 767 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 5 | 16.64 | 16.03 |
768 rows × 8 columns
# Collapse X8 (codes 0-5) into a binary flag: 0 stays 0, everything above becomes 1.
# clip(upper=1) preserves the integer dtype and leaves zeros untouched.
temp_df['X8'] = temp_df['X8'].clip(upper=1)
temp_df
| X2 | X3 | X4 | X5 | X7 | X8 | Y1 | Y2 | |
|---|---|---|---|---|---|---|---|---|
| 0 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 1 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 2 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 3 | 514.5 | 294.0 | 110.25 | 7.0 | 0.0 | 0 | 15.55 | 21.33 |
| 4 | 563.5 | 318.5 | 122.50 | 7.0 | 0.0 | 0 | 20.84 | 28.28 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 784.0 | 343.0 | 220.50 | 3.5 | 0.4 | 1 | 17.88 | 21.40 |
| 764 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 1 | 16.54 | 16.88 |
| 765 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 1 | 16.44 | 17.11 |
| 766 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 1 | 16.48 | 16.61 |
| 767 | 808.5 | 367.5 | 220.50 | 3.5 | 0.4 | 1 | 16.64 | 16.03 |
768 rows × 8 columns
# Y2 with X4 retained, binary X8, and Y1 kept as a predictor.
X = temp_df.drop(['Y2'],axis=1)
Y = temp_df['Y2']
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
cbr = CatBoostRegressor(verbose=0, n_estimators=10000,early_stopping_rounds=100).fit(x_train,y_train)
pred = cbr.predict(x_test)
pred_trn = cbr.predict(x_train)
print(r2_score(y_train, pred_trn), r2_score(y_test, pred))
0.9936908558985426 0.9788257878445978
visulaize_performance_of_the_model(pred, y_test, 'CatBoost regressor')
# Y1 from features alone (X4 retained, binary X8).
X = temp_df.drop(['Y1','Y2'],axis=1)
Y = temp_df['Y1']
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)
cbr = CatBoostRegressor(verbose=0, n_estimators=10000,early_stopping_rounds=100).fit(x_train,y_train)
pred = cbr.predict(x_test)
pred_trn = cbr.predict(x_train)
print(r2_score(y_train, pred_trn), r2_score(y_test, pred))
0.9980610932677114 0.9974413562218192
# Predicted-vs-actual plot for the final Y1 model.
visulaize_performance_of_the_model(pred, y_test, 'CatBoost regressor')